Machine Learning Models for Geographic and Remote Work Analysis
KMeans Clustering, Salary Prediction, and Remote Work Classification
1 Introduction
2 Data Cleaning & Preprocessing
---
title: "Data Analysis"
subtitle: "Comprehensive Data Cleaning & Exploratory Analysis"
author:
  - name: "Bingrui Qiao"
  - name: "Zhengyu Zhou"
  - name: "Junhao Wang"
affiliations: "Boston University"
bibliography: references.bib
csl: csl/econometrica.csl
format:
  html:
    toc: true
    number-sections: true
execute:
  echo: true
  eval: true
  freeze: false
  error: false
  cache: false
  enabled: !expr (os.getenv("CI", "false") != "true")
jupyter: python3
---
2.1 Data Cleaning & Preprocessing
In this section, we clean the Lightcast dataset, log each step, and save a reproducible cleaned CSV for downstream EDA.
2.1.1 Setup & Load (clean version)
import os, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Handle missingno (optional; only import it once, inside the guard)
try:
    import missingno as msno
    HAS_MSNO = True
except ImportError:
    HAS_MSNO = False
# Paths
DATA_PATH = "data/lightcast_job_postings.csv"
CLEAN_PATH = "data/cleaned_lightcast.csv"
LOG_PATH = "logs/cleaning_log.txt"
FIG_MISS = "figures/missing_values_corr_heatmap.png"
# Ensure output dirs exist
os.makedirs("logs", exist_ok=True)
os.makedirs("figures", exist_ok=True)
# Logger
def log(msg: str):
    print(msg)
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(msg.rstrip() + "\n")
# Start a fresh log
with open(LOG_PATH, "w", encoding="utf-8") as f:
    f.write("=== DATA CLEANING LOG START ===\n")
# Ping file (to confirm write permission)
with open("logs/_ping.txt", "w", encoding="utf-8") as f:
    f.write("hello from python @ " + str(datetime.datetime.now()) + "\n")
# AUTO-DOWNLOAD IF MISSING
if not os.path.exists(DATA_PATH):
    gdrive_url = "https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ"
    try:
        import gdown
        gdown.download(gdrive_url, DATA_PATH, quiet=False)
    except Exception as e:
        raise FileNotFoundError(f"Could not download dataset.\nError: {e}")
# Load data
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Check path & working dir.")
df = pd.read_csv(DATA_PATH, low_memory=False, on_bad_lines="skip")
log(f"Loaded dataset → rows: {len(df):,}, cols: {df.shape[1]}")
2.1.2 Drop redundant/irrelevant columns
columns_to_drop = [
    "ID","URL","ACTIVE_URLS","DUPLICATES","LAST_UPDATED_TIMESTAMP",
    "NAICS2","NAICS3","NAICS4","NAICS5","NAICS6",
    "SOC_2","SOC_3","SOC_5"
]
before_cols = df.shape[1]
df.drop(columns=columns_to_drop, inplace=True, errors="ignore")
after_cols = df.shape[1]
log(f"Dropped {before_cols - after_cols} columns; remaining columns: {after_cols}")
# Normalize names & basic types
df.columns = df.columns.str.strip().str.upper()
if "POSTED" in df.columns:
    df["POSTED"] = pd.to_datetime(df["POSTED"], errors="coerce")
if "SALARY" in df.columns:
    df["SALARY"] = pd.to_numeric(
        df["SALARY"].astype(str).str.replace(r"[^0-9.-]", "", regex=True),
        errors="coerce"
    )
2.1.3 Visualize & handle missing values
# --- Missing values correlation heatmap ---
cols_with_na = [c for c in df.columns if df[c].isna().any()]
sub = df[cols_with_na]
if len(cols_with_na) >= 2:
    if len(cols_with_na) > 25:
        top_na_cols = sub.isna().mean().sort_values(ascending=False).head(25).index
        sub = sub[top_na_cols]
    na_corr = sub.isna().astype(int).corr()
    mask = np.triu(np.ones_like(na_corr, dtype=bool))
    plt.figure(figsize=(11, 8))
    sns.heatmap(
        na_corr, mask=mask, cmap="coolwarm", vmin=-1, vmax=1,
        annot=True, fmt=".1f", linewidths=.5,
        cbar_kws={"shrink": .8}
    )
    plt.title("Missing Values Correlation Heatmap", fontsize=14)
    plt.xticks(rotation=45, ha="right", fontsize=8)
    plt.yticks(rotation=0, fontsize=8)
    plt.tight_layout()
    plt.savefig(FIG_MISS, dpi=150)
    plt.show()
else:
    log("No columns with missing values; skipping heatmap.")
2.1.4 Remove duplicates
subset_cols = [c for c in ["TITLE","COMPANY","LOCATION","POSTED"] if c in df.columns]
before = len(df)
if subset_cols:
    df.drop_duplicates(subset=subset_cols, keep="first", inplace=True)
after = len(df)
log(f"Removed duplicates by {subset_cols}: {before - after} rows dropped; remaining: {after}")
2.1.5 Optional salary sanity filter
if "SALARY" in df.columns:
bad = (df["SALARY"] < 1) | (df["SALARY"] > 1_000_000)
n_bad = int(bad.sum())
if n_bad > 0:
df.loc[bad, "SALARY"] = np.nan
med2 = df["SALARY"].median()
df["SALARY"].fillna(med2, inplace=True)
log(f"Clipped extreme Salary ({n_bad} rows) and refilled with median {med2:.2f}")2.1.6 Save & summary
df.to_csv(CLEAN_PATH, index=False)
log(f"Saved cleaned dataset → {CLEAN_PATH}")
summary = f"Rows: {len(df):,}\nColumns: {df.shape[1]}\nSample columns: {list(df.columns)[:12]}"
print(summary)
log("✅ Cleaning pipeline finished successfully.")2.2 Exploratory Data Analysis (EDA)
Exploratory Data Analysis (EDA) allows us to identify patterns and distributions in the job market dataset. In this section, we focus on three aspects:
1. Job postings by industry
2. Salary distributions
3. Remote vs. on-site job proportions
2.2.1 Job Postings by Industry
Understanding industry demand helps reveal which sectors are most active in hiring.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Load dataset
df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)
# Ensure /figures directory exists
os.makedirs("figures", exist_ok=True)
# Industry column
industry_col = "NAICS_2022_6_NAME"
# Clean up column names
df.columns = df.columns.str.strip()
# Drop unclassified industries
df = df[~df[industry_col].str.lower().str.contains("unclassified", na=False)]
# Count top 10 industries
top_industries = df[industry_col].value_counts().head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_industries.values, y=top_industries.index, orient="h")
plt.title("Top 10 Industries by Job Postings")
plt.xlabel("Number of Job Postings")
plt.ylabel("Industry")
plt.tight_layout()
plt.savefig("figures/industry_postings.png", dpi=300)
plt.show()
A bar chart was chosen to visualize the number of job postings across different industries. This format makes it easy to compare industry demand and helps job seekers understand which sectors have the highest hiring activity.
Job demand is concentrated in tech and professional services: Custom Computer Programming Services leads, followed closely by Management Consulting, Employment Placement Agencies, and Computer Systems Design—each with ~4–5k postings. After these, volumes drop sharply to a second tier (Commercial Banking, CPA offices, Temporary Help, Health/Medical Insurance, Other Computer Services, Colleges/Universities) at ~1.3–2.0k. The prominence of staffing/placement agencies signals broad, economy-wide hiring, while the tech-heavy top ranks highlight sustained demand for digital and knowledge-worker skills.
2.2.2 Salary Distribution by Industry
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)
# Define column names
industry_col = "NAICS_2022_6_NAME"
salary_col = "SALARY"
# Filter and clean salary
df = df[df[salary_col] > 0]
#df = df[~df[industry_col].str.lower().str.contains("unclassified")]
# Select top 10 industries by posting count
top10 = df[industry_col].value_counts().head(10).index
df_top10 = df[df[industry_col].isin(top10)]
# --- Boxplot Visualization ---
sns.set_style("whitegrid")
palette = sns.color_palette("Spectral", n_colors=10)
plt.figure(figsize=(12, 8))
sns.boxplot(
    data=df_top10,
    y=industry_col,
    x=salary_col,
    hue=industry_col,   # assign hue to avoid seaborn's palette deprecation warning
    legend=False,
    palette=palette,
    showfliers=True,    # show outliers
    linewidth=1.2
)
# Add title & labels
plt.title("Salary Distribution by Industry (Top 10 Industries)", fontsize=18, weight="bold")
plt.xlabel("Salary (USD)", fontsize=14)
plt.ylabel("Industry", fontsize=14)
# Light grid background
plt.grid(axis="x", linestyle="--", alpha=0.3)
plt.tight_layout()
# Save figure
plt.savefig("figures/salary_distribution_by_industry_top10.png", dpi=300, bbox_inches="tight")
plt.show()
We use a box plot to show the salary distribution within each major hiring industry. Unlike bar charts, which show only averages, box plots reveal the median, the spread, and the outliers in each industry. This helps job seekers understand not only which industries pay more on average, but also how widely salaries vary and where the upside is greatest.
Across industries, some knowledge- and service-intensive fields (such as Unclassified Industry and Custom Computer Programming Services) show relatively high and widely dispersed salaries: the median is high and the upper tail is long, indicating a substantial number of highly paid positions. Management Consulting, Computer Systems Design, and Direct Health and Medical Insurance also show wide salary ranges, reflecting broad pay dispersion within those professions. In contrast, industries such as Colleges and Universities, CPA offices, Commercial Banking, and software publishers show tighter ranges with fewer extreme outliers, suggesting more standardized pay and less room at the top of the salary distribution.
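To back this reading of the boxes with numbers, the median and interquartile range per industry can be computed directly. A minimal sketch using the columns defined above:
# Median and IQR of salary for each of the top-10 industries
salary_summary = (
    df_top10.groupby(industry_col)[salary_col]
    .quantile([0.25, 0.5, 0.75])
    .unstack()
    .rename(columns={0.25: "Q1", 0.5: "Median", 0.75: "Q3"})
)
salary_summary["IQR"] = salary_summary["Q3"] - salary_summary["Q1"]
print(salary_summary.sort_values("Median", ascending=False).round(0))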
2.2.3 Remote vs. On-Site Jobs
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
os.makedirs("figures", exist_ok=True)
df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)
# Clean the REMOTE_TYPE_NAME column
df["REMOTE_TYPE_NAME"] = (
    df["REMOTE_TYPE_NAME"]
    .astype(str).str.strip().str.lower()
    .replace({"[none]": None, "none": None, "unknown": None, "nan": None, "na": None, "null": None, "": None})
)
# Count each type
remote_counts = df["REMOTE_TYPE_NAME"].value_counts()
# Visual
plt.figure(figsize=(6, 6))
plt.pie(
    remote_counts.values,
    labels=remote_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    wedgeprops={"edgecolor": "white"}
)
plt.title("Remote vs. On-Site Job Distribution", fontsize=13)
plt.tight_layout()
plt.savefig("figures/remote_vs_onsite.png", dpi=300, bbox_inches="tight")
plt.show()
A pie chart was selected to display the proportions of job types (remote, hybrid, and on-site). This format offers an intuitive visual summary, helping job seekers understand the flexibility of job opportunities in 2024.
Among postings that explicitly state a work arrangement, the market looks overwhelmingly remote: nearly four out of five (78.4%) are fully remote, another 14.4% offer hybrid options, and only 7.3% require full on-site work. This reading carries a caveat, though: most postings leave REMOTE_TYPE_NAME blank, and the regrouped analysis below, which treats unspecified postings as on-site, paints a very different picture. For candidates, remote-first skills (self-management, async collaboration) remain valuable wherever a posting explicitly advertises flexibility.
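To quantify how much of the dataset the pie chart actually covers, we can compute the share of postings with a stated work arrangement. A small sketch relying on the cleaned column above, where missing markers were mapped to None:
# Share of postings that specify a work arrangement at all
n_total = len(df)
n_labeled = df["REMOTE_TYPE_NAME"].notna().sum()
print(f"Postings with a stated arrangement: {n_labeled:,} of {n_total:,} ({n_labeled / n_total:.1%})")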
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
os.makedirs("figures", exist_ok=True)
df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)
# =========================================
# 1️⃣ Clean REMOTE_TYPE_NAME
# =========================================
df["REMOTE_TYPE_NAME"] = (
    df["REMOTE_TYPE_NAME"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        "[none]": None,
        "none": None,
        "unknown": None,
        "nan": None,
        "na": None,
        "null": None,
        "": None
    })
)
# =========================================
# 2️⃣ Re-group into Remote / Hybrid / Onsite
# =========================================
def map_remote(x):
    if pd.isna(x):
        return "Onsite"
    if "remote" in x and "hybrid" in x:
        return "Hybrid"
    if x == "hybrid remote":
        return "Hybrid"
    if x == "remote":
        return "Remote"
    if x == "not remote":
        return "Onsite"
    return "Onsite"
df["REMOTE_GROUP"] = df["REMOTE_TYPE_NAME"].apply(map_remote)
print(df["REMOTE_GROUP"].value_counts())
# =========================================
# 3️⃣ Visualization (Pie Chart)
# =========================================
remote_counts = df["REMOTE_GROUP"].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(
    remote_counts.values,
    labels=remote_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    wedgeprops={"edgecolor": "white"}
)
plt.title("Remote vs. Hybrid vs. Onsite Job Distribution", fontsize=13)
plt.tight_layout()
# Use a distinct filename so the first pie chart is not overwritten
plt.savefig("figures/remote_vs_hybrid_vs_onsite.png", dpi=300, bbox_inches="tight")
plt.show()
REMOTE_GROUP
Onsite    55304
Remote    11743
Hybrid     2151
Name: count, dtype: int64
Once unspecified postings are counted as on-site, the picture reverses: roughly four out of five postings are on-site. The remote share is therefore highly sensitive to how missing REMOTE_TYPE_NAME values are handled.

3 Skill Gap Analysis
4 Create a team-based skill dataframe
import pandas as pd
skills_data = {
    "Name": ["Bingrui Qiao", "Zhengyu Zhou", "Junhao Wang"],
    "Python": [2, 2, 3],
    "SQL": [2, 2, 3],
    "Machine Learning": [1, 1, 1],
    "Cloud Computing": [2, 2, 3],
    "Docker": [1, 1, 1],
    "AWS": [2, 3, 2]
}
df_skills = pd.DataFrame(skills_data)
df_skills.set_index("Name", inplace=True)
df_skills
| Name | Python | SQL | Machine Learning | Cloud Computing | Docker | AWS |
|---|---|---|---|---|---|---|
| Bingrui Qiao | 2 | 2 | 1 | 2 | 1 | 2 |
| Zhengyu Zhou | 2 | 2 | 1 | 2 | 1 | 3 |
| Junhao Wang | 3 | 3 | 1 | 3 | 1 | 2 |
5 Visualizing Skill Gaps
import seaborn as sns
import matplotlib.pyplot as plt
# Visual style
sns.set_theme(style="whitegrid", font_scale=1.1, rc={
    "axes.facecolor": "white",
    "figure.facecolor": "white",
    "axes.edgecolor": "gray",
    "grid.color": "lightgray"
})
# Color choice
cmap = sns.diverging_palette(220, 20, as_cmap=True)
# Heatmap
plt.figure(figsize=(9, 6))
ax = sns.heatmap(
    df_skills,
    annot=True,
    fmt=".0f",
    cmap=cmap,
    linewidths=0.7,
    cbar_kws={'label': 'Skill Level'},
    square=True
)
# Title
plt.title("Team Skill Levels Heatmap", fontsize=15, weight="bold", pad=20)
plt.xlabel("Skill Domain", fontsize=12)
plt.ylabel("Team Member", fontsize=12)
# layout
plt.xticks(rotation=30, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
# Save figure
plt.savefig("figures/Skill_Gap_Analysis.png", dpi=300, bbox_inches="tight")
plt.show()
6 Compare Team Skills with Market Demand
# Load data
df = pd.read_csv("data/cleaned_lightcast.csv")
# Count keyword occurrences
top_skills = df_skills.columns.tolist()
job_text = df["BODY"].fillna("")
skill_counts = {s: job_text.str.contains(s, case=False).sum() for s in top_skills}
# Append demand row
df_skills.loc["Market Demand"] = [skill_counts[s] for s in top_skills]
df_skills
| Name | Python | SQL | Machine Learning | Cloud Computing | Docker | AWS |
|---|---|---|---|---|---|---|
| Bingrui Qiao | 2 | 2 | 1 | 2 | 1 | 2 |
| Zhengyu Zhou | 2 | 2 | 1 | 2 | 1 | 3 |
| Junhao Wang | 3 | 3 | 1 | 3 | 1 | 2 |
| Market Demand | 11782 | 23202 | 3972 | 1302 | 688 | 14243 |
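One caveat on these counts: plain substring matching can overcount short skill names, e.g. case-insensitive "AWS" also matches "flaws". A hedged alternative using word-boundary regexes (same job_text and top_skills as above; the resulting counts would differ somewhat from the table):
import re
# Count postings whose text contains the skill as a whole word
strict_counts = {
    s: job_text.str.contains(rf"\b{re.escape(s)}\b", case=False, regex=True).sum()
    for s in top_skills
}
print(strict_counts)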
7 Visualize Team Skills and Market Demand
import seaborn as sns
import matplotlib.pyplot as plt
# Same style
sns.set_theme(style="whitegrid", font_scale=1.1, rc={
    "axes.facecolor": "white",
    "figure.facecolor": "white",
    "axes.edgecolor": "gray",
    "grid.color": "lightgray"
})
cmap = sns.diverging_palette(220, 20, as_cmap=True)
# Team Skill Levels
plt.figure(figsize=(8, 2.5))
plt.imshow(df_skills.iloc[:-1], aspect="auto", cmap=cmap)
plt.colorbar(label="Skill Level (1–5)")
plt.yticks(range(len(df_skills.index)-1), df_skills.index[:-1], fontsize=10)
plt.xticks(range(len(df_skills.columns)), df_skills.columns, rotation=30, ha="right", fontsize=9)
plt.title("Team Skill Levels", fontsize=13, weight="bold")
plt.tight_layout()
plt.savefig("figures/Team_Skill_Level.png", dpi=300, bbox_inches="tight")
plt.show()
# Market Demand Heatmap
plt.figure(figsize=(8, 2))
plt.imshow([df_skills.loc["Market Demand"]], aspect="auto", cmap=cmap)
plt.colorbar(label="Market Demand Count")
plt.yticks([0], ["Market Demand"], fontsize=10)
plt.xticks(range(len(df_skills.columns)), df_skills.columns, rotation=30, ha="right", fontsize=9)
plt.title("Market Demand", fontsize=13, weight="bold")
plt.tight_layout()
plt.savefig("figures/Market_Demand.png", dpi=300, bbox_inches="tight")
plt.show()
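Because the team-skill scores (1–3) and the market-demand counts live on wildly different scales, the two heatmaps above are drawn separately. One alternative, sketched below, is to min-max normalize each row to [0, 1] so both can share a single color scale; the normalization choice is ours, not part of the original analysis:
# Min-max normalize each row so skill levels and demand share one color scale
df_norm = df_skills.apply(
    lambda row: (row - row.min()) / (row.max() - row.min()), axis=1
)
plt.figure(figsize=(8, 3))
sns.heatmap(df_norm, annot=df_skills, fmt=".0f", cmap=cmap,
            cbar_kws={"label": "Row-normalized value"})
plt.title("Team Skills vs. Market Demand (row-normalized)", weight="bold")
plt.tight_layout()
plt.show()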

8 Machine Learning Models
9 Classification: Remote vs Non-Remote Jobs
# Import necessary libraries
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np
# Set seed for reproducibility
np.random.seed(42)
# Set plotly renderer
pio.renderers.default = "notebook+notebook_connected+vscode"
# Load the raw dataset with pandas
df = pd.read_csv("data/lightcast_job_postings.csv", low_memory=False)
# Print schema and preview first few rows
print("--- Diagnostic check: Schema and sample rows ---")
print(df.info())
print(df.head())
--- Diagnostic check: Schema and sample rows ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72498 entries, 0 to 72497
Columns: 131 entries, ID to NAICS_2022_6_NAME
dtypes: float64(38), object(93)
memory usage: 72.5+ MB
None
ID LAST_UPDATED_DATE \
0 1f57d95acf4dc67ed2819eb12f049f6a5c11782c 9/6/2024
1 0cb072af26757b6c4ea9464472a50a443af681ac 8/2/2024
2 85318b12b3331fa490d32ad014379df01855c557 9/6/2024
3 1b5c3941e54a1889ef4f8ae55b401a550708a310 9/6/2024
4 cb5ca25f02bdf25c13edfede7931508bfd9e858f 6/19/2024
LAST_UPDATED_TIMESTAMP DUPLICATES POSTED EXPIRED DURATION \
0 2024-09-06 20:32:57.352 Z 0.0 6/2/2024 6/8/2024 6.0
1 2024-08-02 17:08:58.838 Z 0.0 6/2/2024 8/1/2024 NaN
2 2024-09-06 20:32:57.352 Z 1.0 6/2/2024 7/7/2024 35.0
3 2024-09-06 20:32:57.352 Z 1.0 6/2/2024 7/20/2024 48.0
4 2024-06-19 07:00:00.000 Z 0.0 6/2/2024 6/17/2024 15.0
SOURCE_TYPES SOURCES \
0 [\n "Company"\n] [\n "brassring.com"\n]
1 [\n "Job Board"\n] [\n "maine.gov"\n]
2 [\n "Job Board"\n] [\n "dejobs.org"\n]
3 [\n "Job Board"\n] [\n "disabledperson.com",\n "dejobs.org"\n]
4 [\n "FreeJobBoard"\n] [\n "craigslist.org"\n]
URL ... NAICS_2022_2 \
0 [\n "https://sjobs.brassring.com/TGnewUI/Sear... ... 44.0
1 [\n "https://joblink.maine.gov/jobs/1085740"\n] ... 56.0
2 [\n "https://dejobs.org/dallas-tx/data-analys... ... 52.0
3 [\n "https://www.disabledperson.com/jobs/5948... ... 52.0
4 [\n "https://modesto.craigslist.org/sls/77475... ... 99.0
NAICS_2022_2_NAME NAICS_2022_3 \
0 Retail Trade 441.0
1 Administrative and Support and Waste Managemen... 561.0
2 Finance and Insurance 524.0
3 Finance and Insurance 522.0
4 Unclassified Industry 999.0
NAICS_2022_3_NAME NAICS_2022_4 \
0 Motor Vehicle and Parts Dealers 4413.0
1 Administrative and Support Services 5613.0
2 Insurance Carriers and Related Activities 5242.0
3 Credit Intermediation and Related Activities 5221.0
4 Unclassified Industry 9999.0
NAICS_2022_4_NAME NAICS_2022_5 \
0 Automotive Parts, Accessories, and Tire Retailers 44133.0
1 Employment Services 56132.0
2 Agencies, Brokerages, and Other Insurance Rela... 52429.0
3 Depository Credit Intermediation 52211.0
4 Unclassified Industry 99999.0
NAICS_2022_5_NAME NAICS_2022_6 \
0 Automotive Parts and Accessories Retailers 441330.0
1 Temporary Help Services 561320.0
2 Other Insurance Related Activities 524291.0
3 Commercial Banking 522110.0
4 Unclassified Industry 999999.0
NAICS_2022_6_NAME
0 Automotive Parts and Accessories Retailers
1 Temporary Help Services
2 Claims Adjusting
3 Commercial Banking
4 Unclassified Industry
[5 rows x 131 columns]
# Take subset of relevant columns
relevant_columns = [
    "SALARY", "MIN_YEARS_EXPERIENCE", "EDUCATION_LEVELS_NAME",
    "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "DURATION",
    "IS_INTERNSHIP", "COMPANY_IS_STAFFING", "STATE_NAME", "CITY_NAME",
    "MSA_NAME", "ONET", "ONET_NAME", "NAICS2_NAME", "TITLE_NAME"
]
df_analysis = df[relevant_columns].copy()
df_analysis.head()
|  | SALARY | MIN_YEARS_EXPERIENCE | EDUCATION_LEVELS_NAME | EMPLOYMENT_TYPE_NAME | REMOTE_TYPE_NAME | DURATION | IS_INTERNSHIP | COMPANY_IS_STAFFING | STATE_NAME | CITY_NAME | MSA_NAME | ONET | ONET_NAME | NAICS2_NAME | TITLE_NAME |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 2.0 | [\n "Bachelor's degree"\n] | Full-time (> 32 hours) | [None] | 6.0 | False | False | Arkansas | El Dorado, AR | El Dorado, AR | 15-2051.01 | Business Intelligence Analysts | Retail Trade | Enterprise Analysts |
| 1 | NaN | 3.0 | [\n "No Education Listed"\n] | Full-time (> 32 hours) | Remote | NaN | False | True | Maine | Augusta, ME | Augusta-Waterville, ME | 15-2051.01 | Business Intelligence Analysts | Administrative and Support and Waste Managemen... | Oracle Consultants |
| 2 | NaN | 5.0 | [\n "Bachelor's degree"\n] | Full-time (> 32 hours) | [None] | 35.0 | False | False | Texas | Dallas, TX | Dallas-Fort Worth-Arlington, TX | 15-2051.01 | Business Intelligence Analysts | Finance and Insurance | Data Analysts |
| 3 | NaN | 3.0 | [\n "No Education Listed"\n] | Full-time (> 32 hours) | [None] | 48.0 | False | False | Arizona | Phoenix, AZ | Phoenix-Mesa-Chandler, AZ | 15-2051.01 | Business Intelligence Analysts | Finance and Insurance | Management Analysts |
| 4 | 92500.0 | NaN | [\n "No Education Listed"\n] | Part-time / full-time | [None] | 15.0 | False | False | California | Modesto, CA | Modesto, CA | 15-2051.01 | Business Intelligence Analysts | Unclassified Industry | Unclassified |
df_analysis["REMOTE_TYPE_NAME"] = df_analysis["REMOTE_TYPE_NAME"].astype(str).str.strip()
remote_col = df_analysis["REMOTE_TYPE_NAME"].replace({"[None]": None})
df_analysis["REMOTE_GROUP"] = np.select(
[
remote_col.eq("Remote"),
remote_col.eq("Hybrid Remote"),
remote_col.eq("Not Remote"),
remote_col.isna()
],
["Remote", "Hybrid", "Onsite", "Onsite"],
default="Onsite"
)
df_analysis.drop(columns=["REMOTE_TYPE_NAME"], inplace=True)
# EMPLOYMENT_GROUP
emp_col = df_analysis["EMPLOYMENT_TYPE_NAME"].astype(str).str.strip()
df_analysis["EMPLOYMENT_GROUP"] = np.select(
[
emp_col.eq("Full-time (> 32 hours)"),
emp_col.eq("Part-time (⤠32 hours)"),
emp_col.eq("Part-time / full-time"),
emp_col.isna()
],
["Full-time", "Part-time", "Flexible", "Full-time"],
default="Flexible"
)
df_analysis.drop(columns=["EMPLOYMENT_TYPE_NAME"], inplace=True)
# MIN_YEARS_EXPERIENCE & group
df_analysis["MIN_YEARS_EXPERIENCE"] = df_analysis["MIN_YEARS_EXPERIENCE"].fillna(0)
# make sure it is numerical
df_analysis["MIN_YEARS_EXPERIENCE"] = pd.to_numeric(
df_analysis["MIN_YEARS_EXPERIENCE"],
errors="coerce"
).fillna(0)
exp = df_analysis["MIN_YEARS_EXPERIENCE"]
df_analysis["MIN_YEARS_EXPERIENCE_GROUP"] = np.select(
[
(exp >= 0) & (exp <= 1),
(exp > 1) & (exp <= 3),
(exp > 3) & (exp <= 5),
(exp > 5) & (exp <= 10),
(exp > 10)
],
["Internship/Entry Level", "Junior", "Mid-Level", "Senior", "Expert"],
default="Internship/Entry Level"
)
# DURATION:null value & 0 -> 1
dur = df_analysis["DURATION"]
df_analysis["DURATION"] = (
dur.fillna(1)
.replace(0, 1)
)
# clean EDUCATION_LEVELS_NAME -> EDUCATION_LEVELS_CLEAN
edu_raw = df_analysis["EDUCATION_LEVELS_NAME"].fillna("")
edu_clean = (
    edu_raw
    .astype(str)
    .str.replace(r"[\[\]\n]", "", regex=True)
    .str.strip()
)
df_analysis["EDUCATION_LEVELS_CLEAN"] = edu_clean
df_analysis.drop(columns=["EDUCATION_LEVELS_NAME"], inplace=True)
# Fill in the blank STATE_NAME/CITY_NAME
df_analysis["STATE_NAME"] = df_analysis["STATE_NAME"].fillna("Unknown")
df_analysis["CITY_NAME"] = df_analysis["CITY_NAME"].fillna("Unknown")
# ONET/ONET_NAME/NAICS2_NAME null value handling
df_analysis["ONET"] = df_analysis["ONET"].fillna("00-0000.00")
df_analysis["ONET_NAME"] = df_analysis["ONET_NAME"].fillna("Unknown")
df_analysis["NAICS2_NAME"] = df_analysis["NAICS2_NAME"].fillna("Unknown")
print(df_analysis.head())
   SALARY  MIN_YEARS_EXPERIENCE  DURATION  IS_INTERNSHIP  COMPANY_IS_STAFFING  \
0 NaN 2.0 6.0 False False
1 NaN 3.0 1.0 False True
2 NaN 5.0 35.0 False False
3 NaN 3.0 48.0 False False
4 92500.0 0.0 15.0 False False
STATE_NAME CITY_NAME MSA_NAME ONET \
0 Arkansas El Dorado, AR El Dorado, AR 15-2051.01
1 Maine Augusta, ME Augusta-Waterville, ME 15-2051.01
2 Texas Dallas, TX Dallas-Fort Worth-Arlington, TX 15-2051.01
3 Arizona Phoenix, AZ Phoenix-Mesa-Chandler, AZ 15-2051.01
4 California Modesto, CA Modesto, CA 15-2051.01
ONET_NAME \
0 Business Intelligence Analysts
1 Business Intelligence Analysts
2 Business Intelligence Analysts
3 Business Intelligence Analysts
4 Business Intelligence Analysts
NAICS2_NAME TITLE_NAME \
0 Retail Trade Enterprise Analysts
1 Administrative and Support and Waste Managemen... Oracle Consultants
2 Finance and Insurance Data Analysts
3 Finance and Insurance Management Analysts
4 Unclassified Industry Unclassified
REMOTE_GROUP EMPLOYMENT_GROUP MIN_YEARS_EXPERIENCE_GROUP \
0 Onsite Full-time Junior
1 Remote Full-time Junior
2 Onsite Full-time Mid-Level
3 Onsite Full-time Junior
4 Onsite Flexible Internship/Entry Level
EDUCATION_LEVELS_CLEAN
0 "Bachelor's degree"
1 "No Education Listed"
2 "Bachelor's degree"
3 "No Education Listed"
4 "No Education Listed"
# Prepare binary classification data (Remote vs Onsite only)
df_binary = df_analysis[df_analysis["REMOTE_GROUP"].isin(["Remote", "Onsite"])].copy()
print(df_binary["REMOTE_GROUP"].value_counts())
REMOTE_GROUP
Onsite    57741
Remote    12497
Name: count, dtype: int64
# Feature Engineering - Encode categorical variables
# Define categorical and numeric columns
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
categorical_cols = [
    "EMPLOYMENT_GROUP",
    "MIN_YEARS_EXPERIENCE_GROUP",
    "EDUCATION_LEVELS_CLEAN",
    "STATE_NAME",
    "NAICS2_NAME"
]
numeric_cols = ["MIN_YEARS_EXPERIENCE", "DURATION"]
# 1. create label
label_encoder = LabelEncoder()
df_binary["label"] = label_encoder.fit_transform(df_binary["REMOTE_GROUP"])
# Take a look at the category order
print("Label mapping:", dict(zip(label_encoder.classes_,
label_encoder.transform(label_encoder.classes_))))
# 2. create features(indexer + onehot + VectorAssembler)
X = df_binary[categorical_cols + numeric_cols]
y = df_binary["label"]
# ColumnTransformer
preprocess = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
("num", "passthrough", numeric_cols),
]
)
X_features = preprocess.fit_transform(X)
print("--- Prepared Data Preview (pandas + sklearn) ---")
print("X_features type:", type(X_features))
print("X_features shape:", X_features.shape)
print("First 5 labels:", y.iloc[:5].tolist())
print("First 5 REMOTE_GROUP:", df_binary["REMOTE_GROUP"].iloc[:5].tolist())
if hasattr(X_features, "toarray"):
    X_dense = X_features.toarray()
else:
    X_dense = np.asarray(X_features)
df_prepared = pd.DataFrame({
    "REMOTE_GROUP": df_binary["REMOTE_GROUP"].reset_index(drop=True),
    "label": y.reset_index(drop=True),   # df_binary["label"]
    "features": list(X_dense)            # one numpy array per row
})
print("\n--- df_prepared preview ---")
print(df_prepared.head())
print(df_prepared.shape)
Label mapping: {'Onsite': np.int64(0), 'Remote': np.int64(1)}
--- Prepared Data Preview (pandas + sklearn) ---
X_features type: <class 'scipy.sparse._csr.csr_matrix'>
X_features shape: (70238, 112)
First 5 labels: [0, 1, 0, 0, 0]
First 5 REMOTE_GROUP: ['Onsite', 'Remote', 'Onsite', 'Onsite', 'Onsite']
--- df_prepared preview ---
REMOTE_GROUP label features
0 Onsite 0 [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
1 Remote 1 [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
2 Onsite 0 [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
3 Onsite 0 [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
4 Onsite 0 [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
(70238, 3)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(
    df_prepared,
    test_size=0.3,     # 30% for test
    random_state=42,   # seed = 42
)
print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")
Training set size: 49166
Test set size: 21072
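Two optional refinements to this split, sketched here under the assumption that df_prepared, X_features, and y are as built above (not what was run to produce the results below): stratifying keeps the roughly 82% Onsite / 18% Remote ratio identical across train and test, and splitting row indices instead of dense feature lists avoids materializing 70k numpy arrays:
import numpy as np
from sklearn.model_selection import train_test_split
# Stratified split over row indices; features stay in the sparse CSR matrix
idx = np.arange(X_features.shape[0])
y_arr = y.to_numpy()
idx_train, idx_test = train_test_split(
    idx, test_size=0.3, random_state=42, stratify=y_arr
)
X_train_sp, X_test_sp = X_features[idx_train], X_features[idx_test]
y_train_sp, y_test_sp = y_arr[idx_train], y_arr[idx_test]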
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# break out X / y
X_train = np.stack(train_data["features"].to_numpy()) # shape: (n_train, n_features)
y_train = train_data["label"].to_numpy()
X_test = np.stack(test_data["features"].to_numpy())
y_test = test_data["label"].to_numpy()
# Logistic Regression
lr_model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)
lr_model.fit(X_train, y_train)
# Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,   # number of trees
    random_state=42,    # seed
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
print("Both models trained successfully!")
Both models trained successfully!
10 Predict
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
# First extract X_test/y_test from test_data
X_test = np.stack(test_data["features"].to_numpy())
y_test = test_data["label"].to_numpy()
# Predict
# Logistic Regression
y_pred_lr = lr_model.predict(X_test)
# Obtain the positive class probability using predict_proba and calculate the AUC
y_proba_lr = lr_model.predict_proba(X_test)[:, 1]
# Random Forest
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
# Evaluation: Accuracy / F1 / AUC-ROC (f1_score already imported above)
print("LOGISTIC REGRESSION RESULTS")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_lr, average='weighted'):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba_lr):.4f}")
print("RANDOM FOREST RESULTS")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba_rf):.4f}")LOGISTIC REGRESSION RESULTS
Accuracy: 0.8250
F1 Score (weighted): 0.7533
AUC-ROC: 0.6368
RANDOM FOREST RESULTS
Accuracy: 0.8349
F1 Score (weighted): 0.8168
AUC-ROC: 0.7293
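Both models are dominated by the Onsite majority class, as the confusion matrices below make plain: logistic regression recovers only 111 of the 3,739 true Remote postings in the test set. One common remedy, sketched here under the assumption that the features above stay unchanged, is to reweight classes inversely to their frequency and inspect per-class metrics:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Penalize mistakes on the rare Remote class more heavily
lr_balanced = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=-1)
lr_balanced.fit(X_train, y_train)
print(classification_report(y_test, lr_balanced.predict(X_test),
                            target_names=["Onsite", "Remote"]))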
# Step 7: Confusion Matrix
from sklearn.metrics import confusion_matrix
print("\n=== Logistic Regression Confusion Matrix (2x2) ===")
print("(rows = true label, cols = predicted label)")
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
cm_lr_df = pd.DataFrame(
    cm_lr,
    index=["True 0 (Onsite)", "True 1 (Remote)"],
    columns=["Pred 0 (Onsite)", "Pred 1 (Remote)"]
)
print(cm_lr_df)
print("\n=== Random Forest Confusion Matrix (2x2) ===")
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
cm_rf_df = pd.DataFrame(
    cm_rf,
    index=["True 0 (Onsite)", "True 1 (Remote)"],
    columns=["Pred 0 (Onsite)", "Pred 1 (Remote)"]
)
print(cm_rf_df)
=== Logistic Regression Confusion Matrix (2x2) ===
(rows = true label, cols = predicted label)
Pred 0 (Onsite) Pred 1 (Remote)
True 0 (Onsite) 17274 59
True 1 (Remote) 3628 111
=== Random Forest Confusion Matrix (2x2) ===
Pred 0 (Onsite) Pred 1 (Remote)
True 0 (Onsite) 16372 961
True 1 (Remote) 2517 1222
11 Confusion Matrix Visualization
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Calculate the confusion matrix (in the order of [0, 1])
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
print("Logistic Regression CM:\n", cm_lr)
print("Random Forest CM:\n", cm_rf)
# The z (list of list) required for converting to plotly
lr_z = cm_lr.tolist()
rf_z = cm_rf.tolist()
lr_x = ['Onsite', 'Remote'] # Predicted label
lr_y = ['Onsite', 'Remote'] # True label
# Visual
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Logistic Regression', 'Random Forest')
)
# Logistic Regression heatmap
fig.add_trace(
    go.Heatmap(
        z=lr_z,
        x=lr_x,
        y=lr_y,
        colorscale='Blues',
        text=lr_z,
        texttemplate="%{text}",
        showscale=False
    ),
    row=1, col=1
)
# Random Forest heatmap
fig.add_trace(
    go.Heatmap(
        z=rf_z,
        x=lr_x,
        y=lr_y,
        colorscale='Blues',
        text=rf_z,
        texttemplate="%{text}",
        showscale=True
    ),
    row=1, col=2
)
fig.update_layout(
    title_text="Remote vs Onsite Job Classification - Confusion Matrix Comparison",
    title_font_size=16,
    height=400,
    width=900
)
fig.update_xaxes(title_text="Predicted Label")
fig.update_yaxes(title_text="True Label")
fig.write_image("figures/confusion_matrix_comparison.jpg")
Logistic Regression CM:
[[17274 59]
[ 3628 111]]
Random Forest CM:
[[16372 961]
[ 2517 1222]]
12 KMeans Clustering using ONET as reference label
# KMeans Clustering using ONET as reference label
# Check ONET distribution first. In pandas >= 2.0, value_counts().reset_index()
# already yields columns ["ONET_NAME", "count"]; the earlier rename collapsed
# both headers to "count", so it is dropped here.
onet_dist_df = (
    df_analysis["ONET_NAME"]
    .value_counts()
    .reset_index()
)
print(onet_dist_df.head(20))
                        ONET_NAME  count
0  Business Intelligence Analysts  72454
1                         Unknown     44
13 Prepare features for KMeans clustering
# Prepare features for KMeans clustering
# Define columns
cluster_categorical_cols = [
    "EMPLOYMENT_GROUP",
    "MIN_YEARS_EXPERIENCE_GROUP",
    "EDUCATION_LEVELS_CLEAN",
    "STATE_NAME",
    "REMOTE_GROUP",
]
cluster_numeric_cols = ["MIN_YEARS_EXPERIENCE", "DURATION"]
# Prepare the feature table X (only including the columns used for clustering)
X_cluster = df_analysis[cluster_categorical_cols + cluster_numeric_cols]
# 3. ColumnTransformer: one-hot encode categoricals, pass numerics through
cluster_preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cluster_categorical_cols),
        ("num", "passthrough", cluster_numeric_cols),
    ]
)
cluster_pipeline = Pipeline(
    steps=[
        ("preprocess", cluster_preprocess)
    ]
)
# Fit and transform to generate the feature matrix
X_cluster_features = cluster_pipeline.fit_transform(X_cluster)
if hasattr(X_cluster_features, "toarray"):
    X_cluster_dense = X_cluster_features.toarray()
else:
    X_cluster_dense = np.asarray(X_cluster_features)
print("--- Clustering Features Shape ---")
print(X_cluster_dense.shape)
# 5. ONET_NAME -> ONET_label
onet_le = LabelEncoder()
df_analysis = df_analysis.copy()
df_analysis["ONET_NAME"] = df_analysis["ONET_NAME"].fillna("Unknown")
df_analysis["ONET_label"] = onet_le.fit_transform(df_analysis["ONET_NAME"])
print("ONET label mapping (first few):")
for name, idx in list(zip(onet_le.classes_, range(len(onet_le.classes_))))[:10]:
print(idx, "->", name)
# 6. group df_cluster:ONET_NAME | ONET_label | features
df_cluster = df_analysis.copy()
df_cluster["features"] = list(X_cluster_dense)
print("--- Clustering Data Prepared (pandas) ---")
print(df_cluster[["ONET_NAME", "ONET_label", "features"]].head())
print(df_cluster.shape)--- Clustering Features Shape ---
(72498, 94)
ONET label mapping (first few):
0 -> Business Intelligence Analysts
1 -> Unknown
--- Clustering Data Prepared (pandas) ---
ONET_NAME ONET_label \
0 Business Intelligence Analysts 0
1 Business Intelligence Analysts 0
2 Business Intelligence Analysts 0
3 Business Intelligence Analysts 0
4 Business Intelligence Analysts 0
features
0 [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
1 [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
2 [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
3 [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
4 [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
(72498, 18)
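One caveat worth noting before clustering: MIN_YEARS_EXPERIENCE and especially DURATION sit on much larger numeric ranges than the 0/1 one-hot columns, so they can dominate the Euclidean distances that KMeans minimizes. A hedged sketch of the alternative, standardizing the numeric columns inside the same ColumnTransformer layout (StandardScaler is our addition, not part of the pipeline above):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Same layout as cluster_preprocess, but with scaled numeric features
scaled_preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cluster_categorical_cols),
        ("num", StandardScaler(), cluster_numeric_cols),
    ]
)
X_scaled = scaled_preprocess.fit_transform(X_cluster)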
14 Find optimal K using Elbow Method
# Find optimal K using Elbow Method
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Extract the feature matrix from the df_cluster
X_cluster = np.stack(df_cluster["features"].to_numpy()) # shape: (n_samples, n_features)
k_values = [3, 5, 7, 10, 15, 20]
silhouette_scores = []
print("--- Finding Optimal K ---")
for k in k_values:
    kmeans = KMeans(
        n_clusters=k,
        random_state=42,   # seed = 42
        n_init="auto"
    )
    labels = kmeans.fit_predict(X_cluster)
    score = silhouette_score(X_cluster, labels)
    silhouette_scores.append(score)
    print(f"K = {k}: Silhouette Score = {score:.4f}")
--- Finding Optimal K ---
K = 3: Silhouette Score = 0.5132
K = 5: Silhouette Score = 0.4411
K = 7: Silhouette Score = 0.3804
K = 10: Silhouette Score = 0.3590
K = 15: Silhouette Score = 0.3334
K = 20: Silhouette Score = 0.2926
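The scores decline monotonically from K = 3, which is why K = 3 is chosen below. A quick line plot of score versus K makes that choice easier to see; this small addition reuses k_values and silhouette_scores from the loop above (the figure path is our own):
import matplotlib.pyplot as plt
# Silhouette score as a function of K
plt.figure(figsize=(7, 4))
plt.plot(k_values, silhouette_scores, marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("Silhouette score")
plt.title("Silhouette Score vs. K")
plt.tight_layout()
plt.savefig("figures/silhouette_vs_k.png", dpi=150)
plt.show()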
15 Extract the feature matrix
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Extract the feature matrix
X_cluster = np.stack(df_cluster["features"].to_numpy()) # shape: (n_samples, n_features)
# Train the final KMeans model
optimal_k = 3
kmeans_final = KMeans(
    n_clusters=optimal_k,
    random_state=42,   # seed = 42
    n_init="auto"
)
cluster_labels = kmeans_final.fit_predict(X_cluster)
# Write the cluster back to the DataFrame
df_clustered = df_cluster.copy()
df_clustered["cluster"] = cluster_labels
# 4. calculate silhouette
final_score = silhouette_score(X_cluster, cluster_labels)
print(f"\n--- Final KMeans Model (K={optimal_k}) ---")
print(f"Silhouette Score: {final_score:.4f}")
# Cluster Distribution
print("\n--- Cluster Distribution ---")
print(
    df_clustered["cluster"]
    .value_counts()
    .sort_index()
)
--- Final KMeans Model (K=3) ---
Silhouette Score: 0.5132
--- Cluster Distribution ---
cluster
0 19473
1 39891
2 13134
Name: count, dtype: int64
16 Analyze clusters vs ONET reference labels
# Cross-tabulation of clusters and ONET
print("--- Top ONET categories in each cluster ---")
for cluster_id in range(optimal_k):
    print(f"\n=== Cluster {cluster_id} ===")
    top_onet_df = (
        df_clustered[df_clustered["cluster"] == cluster_id]["ONET_NAME"]
        .value_counts()
        .reset_index()   # columns: ["ONET_NAME", "count"]
        .head(5)
    )
    print(top_onet_df)
--- Top ONET categories in each cluster ---
=== Cluster 0 ===
                        ONET_NAME  count
0  Business Intelligence Analysts  19472
1                         Unknown      1
=== Cluster 1 ===
                        ONET_NAME  count
0  Business Intelligence Analysts  39851
1                         Unknown     40
=== Cluster 2 ===
                        ONET_NAME  count
0  Business Intelligence Analysts  13131
1                         Unknown      3
Because nearly every posting carries the same ONET category (Business Intelligence Analysts), ONET offers almost no discriminative signal for validating the clusters; the clusters instead reflect differences in the engineered features such as state, experience level, and work arrangement.
17 Analyze cluster characteristics
print("--- Cluster Characteristics ---")
# === Remote work distribution by cluster ===
print("\n=== Remote Work by Cluster ===")
remote_by_cluster = (
    df_clustered
    .groupby(["cluster", "REMOTE_GROUP"])
    .size()
    .reset_index(name="count")
    .sort_values(["cluster", "REMOTE_GROUP"])
)
print(remote_by_cluster)
remote_pivot = remote_by_cluster.pivot(
    index="cluster",
    columns="REMOTE_GROUP",
    values="count"
).fillna(0).astype(int)
print("\nRemote work pivot table:")
print(remote_pivot)
# === Experience level by cluster ===
print("\n=== Experience Level by Cluster ===")
exp_by_cluster = (
    df_clustered
    .groupby(["cluster", "MIN_YEARS_EXPERIENCE_GROUP"])
    .size()
    .reset_index(name="count")
    .sort_values(["cluster", "MIN_YEARS_EXPERIENCE_GROUP"])
)
print(exp_by_cluster)
exp_pivot = exp_by_cluster.pivot(
    index="cluster",
    columns="MIN_YEARS_EXPERIENCE_GROUP",
    values="count"
).fillna(0).astype(int)
print("\nExperience level pivot table:")
print(exp_pivot)
# === Top states by cluster ===
print("\n=== Top States by Cluster ===")
for cluster_id in range(optimal_k):
    print(f"\n--- Cluster {cluster_id} Top States ---")
    top_states = (
        df_clustered[df_clustered["cluster"] == cluster_id]["STATE_NAME"]
        .value_counts()
        .head(5)
    )
    print(top_states)
--- Cluster Characteristics ---
=== Remote Work by Cluster ===
cluster REMOTE_GROUP count
0 0 Hybrid 540
1 0 Onsite 15203
2 0 Remote 3730
3 1 Hybrid 1177
4 1 Onsite 32518
5 1 Remote 6196
6 2 Hybrid 543
7 2 Onsite 10020
8 2 Remote 2571
Remote work pivot table:
REMOTE_GROUP Hybrid Onsite Remote
cluster
0 540 15203 3730
1 1177 32518 6196
2 543 10020 2571
=== Experience Level by Cluster ===
cluster MIN_YEARS_EXPERIENCE_GROUP count
0 0 Expert 922
1 0 Internship/Entry Level 7249
2 0 Junior 3628
3 0 Mid-Level 3894
4 0 Senior 3780
5 1 Expert 1847
6 1 Internship/Entry Level 14474
7 1 Junior 7179
8 1 Mid-Level 7681
9 1 Senior 8710
10 2 Expert 705
11 2 Internship/Entry Level 4953
12 2 Junior 2438
13 2 Mid-Level 2348
14 2 Senior 2690
Experience level pivot table:
MIN_YEARS_EXPERIENCE_GROUP Expert Internship/Entry Level Junior Mid-Level \
cluster
0 922 7249 3628 3894
1 1847 14474 7179 7681
2 705 4953 2438 2348
MIN_YEARS_EXPERIENCE_GROUP Senior
cluster
0 3780
1 8710
2 2690
=== Top States by Cluster ===
--- Cluster 0 Top States ---
STATE_NAME
Texas 2098
California 2041
Florida 1069
Virginia 1000
New York 801
Name: count, dtype: int64
--- Cluster 1 Top States ---
STATE_NAME
Texas 4519
California 3839
Illinois 2417
Virginia 2082
New York 1969
Name: count, dtype: int64
--- Cluster 2 Top States ---
STATE_NAME
Texas 1450
California 1204
Florida 636
New York 571
Virginia 554
Name: count, dtype: int64
18 Visualization: Cluster Sizes
import pandas as pd
import plotly.express as px
# obtain the cluster frequency
# Obtain the cluster frequency
cluster_counts_series = (
    df_clustered["cluster"]
    .value_counts()
    .sort_index()
)
# Explicitly create a DataFrame with just the two columns "cluster" and "count"
cluster_counts = pd.DataFrame({
    "cluster": cluster_counts_series.index,
    "count": cluster_counts_series.values
})
print(cluster_counts)   # check that the column names are ['cluster', 'count']
# Visual
fig = px.bar(
    cluster_counts,
    x="cluster",
    y="count",
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    labels={"cluster": "Cluster", "count": "Number of Jobs"},
    template="plotly_white",
    color="count",
    color_continuous_scale="Blues",
)
fig.update_layout(font=dict(family="Roboto", size=14))
fig.write_image("figures/kmeans_elbow_plot.jpg")
fig
   cluster  count
0        0  19473
1        1  39891
2        2  13134
19 Visualization: Cluster Distribution
import pandas as pd
import plotly.express as px
# Visualization: Cluster Distribution
# Remap cluster IDs (the target ordering mirrors the Spark version of this analysis)
mapping = {0: 2, 1: 0, 2: 1}
df_clustered["cluster_spark"] = df_clustered["cluster"].map(mapping)
cluster_counts = (
    df_clustered["cluster_spark"]
    .value_counts()
    .sort_index()
)
fig = px.bar(
    x=cluster_counts.index,
    y=cluster_counts.values,
    color=cluster_counts.values,
    color_continuous_scale="Blues",
    labels={"x": "Cluster", "y": "Number of Jobs", "color": "Number of Jobs"},
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    template="plotly_white",
)
fig.update_layout(font=dict(family="Roboto", size=14))
fig.write_image("figures/kmeans_cluster_distribution.jpg")
fig
20 Choropleth Map: Remote Work Percentage by State
import pandas as pd
import numpy as np
# Calculate remote work percentage by state (pandas version)
# 1.STATE_NAME & REMOTE_GROUP
state_remote = (
df_analysis
.groupby(["STATE_NAME", "REMOTE_GROUP"])
.size()
.reset_index(name="count")
)
# 2. pivot:index = STATE_NAME,columns = REMOTE_GROUP(Onsite / Remote / Hybrid)
state_df = (
state_remote
.pivot(index="STATE_NAME", columns="REMOTE_GROUP", values="count")
.fillna(0)
.reset_index()
)
# Ensure that the "Hybrid" column exists
if "Hybrid" not in state_df.columns:
state_df["Hybrid"] = 0
# Calculate Total and Remote_Pct
state_df["Total"] = state_df["Onsite"] + state_df["Remote"] + state_df["Hybrid"]
state_df["Remote_Pct"] = (state_df["Remote"] / state_df["Total"] * 100).round(2)
print("--- State Remote Work Data ---")
print(state_df.head(10))--- State Remote Work Data ---
REMOTE_GROUP STATE_NAME Hybrid Onsite Remote Total Remote_Pct
0 Alabama 15.0 567.0 108.0 690.0 15.65
1 Alaska 7.0 132.0 97.0 236.0 41.10
2 Arizona 43.0 1349.0 246.0 1638.0 15.02
3 Arkansas 12.0 464.0 108.0 584.0 18.49
4 California 262.0 5766.0 1056.0 7084.0 14.91
5 Colorado 55.0 1141.0 259.0 1455.0 17.80
6 Connecticut 41.0 660.0 162.0 863.0 18.77
7 Delaware 15.0 330.0 93.0 438.0 21.23
8 Florida 72.0 3060.0 513.0 3645.0 14.07
9 Georgia 64.0 2169.0 425.0 2658.0 15.99
21 Add state abbreviations for Plotly map
# State name to abbreviation mapping
state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
    'District of Columbia': 'DC'
}
# Add state abbreviation column
state_df['State_Abbrev'] = state_df['STATE_NAME'].map(state_abbrev)
# Remove rows without a valid state abbreviation (e.g., "Unknown")
state_df_clean = state_df[state_df['State_Abbrev'].notna()]
print(f"States with data: {len(state_df_clean)}")
print(state_df_clean[['STATE_NAME', 'State_Abbrev', 'Total', 'Remote_Pct']].head(10))
States with data: 50
REMOTE_GROUP STATE_NAME State_Abbrev Total Remote_Pct
0 Alabama AL 690.0 15.65
1 Alaska AK 236.0 41.10
2 Arizona AZ 1638.0 15.02
3 Arkansas AR 584.0 18.49
4 California CA 7084.0 14.91
5 Colorado CO 1455.0 17.80
6 Connecticut CT 863.0 18.77
7 Delaware DE 438.0 21.23
8 Florida FL 3645.0 14.07
9 Georgia GA 2658.0 15.99
22 Choropleth Map with State Labels showing Remote Percentage
import plotly.graph_objects as go
# Choropleth Map with State Labels showing Remote Percentage
fig = go.Figure(data=go.Choropleth(
    locations=state_df_clean['State_Abbrev'],
    z=state_df_clean['Remote_Pct'],
    locationmode='USA-states',
    colorscale='Blues',
    colorbar_title='Remote %',
    text=state_df_clean['State_Abbrev'] + '<br>' + state_df_clean['Remote_Pct'].astype(str) + '%',
    hovertemplate='<b>%{text}</b><br>Total Jobs: %{customdata[0]}<br>Remote Jobs: %{customdata[1]}<extra></extra>',
    customdata=state_df_clean[['Total', 'Remote']].values,
    marker_line_color='white',
    marker_line_width=1
))
# Add state abbreviations with percentages as annotations
fig.add_scattergeo(
    locations=state_df_clean['State_Abbrev'],
    locationmode='USA-states',
    text=state_df_clean['Remote_Pct'].apply(lambda x: f'{x:.0f}%'),
    mode='text',
    textfont=dict(size=8, color='black', family='Arial Black'),
    showlegend=False
)
fig.update_layout(
    title_text='Remote Work Opportunity by State (% of Jobs that are Remote)',
    title_font_size=18,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
        bgcolor='rgba(0,0,0,0)'
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000
)
fig.write_image("figures/choropleth_remote_work_with_labels.jpg")
print("Enhanced choropleth map saved!")
fig
Enhanced choropleth map saved!
23 Choropleth Map: Total Job Postings by State (with labels)
import plotly.graph_objects as go
# Choropleth Map: Total Job Postings by State (with labels)
# Get top 15 states by total jobs for labeling
top_states = state_df_clean.nlargest(15, 'Total')['State_Abbrev'].tolist()
fig = go.Figure(data=go.Choropleth(
    locations=state_df_clean['State_Abbrev'],
    z=state_df_clean['Total'],
    locationmode='USA-states',
    colorscale='Greens',
    colorbar_title='Total Jobs',
    hovertemplate='<b>%{location}</b><br>Total Jobs: %{z:,}<br>Remote: %{customdata[0]:,}<br>Onsite: %{customdata[1]:,}<extra></extra>',
    customdata=state_df_clean[['Remote', 'Onsite']].values,
    marker_line_color='white',
    marker_line_width=1.5
))
# Add labels for top states (format large numbers with K)
top_state_df = state_df_clean[state_df_clean['State_Abbrev'].isin(top_states)].copy()
top_state_df['Total_Label'] = top_state_df['Total'].apply(
    lambda x: f'{x/1000:.1f}K' if x >= 1000 else str(int(x))
)
fig.add_scattergeo(
    locations=top_state_df['State_Abbrev'],
    locationmode='USA-states',
    text=top_state_df['Total_Label'],
    mode='text',
    textfont=dict(size=10, color='darkgreen', family='Arial Black'),
    showlegend=False
)
fig.update_layout(
    title_text='Total Job Postings by State<br><sup>Labels shown for top 15 states by job volume</sup>',
    title_font_size=16,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000
)
fig.write_image("figures/choropleth_total_jobs_with_labels.jpg")
print("Enhanced total jobs choropleth map saved!")
fig
Enhanced total jobs choropleth map saved!
24 Bar Chart: Top 10 States by Remote Work Percentage
# Filter states with at least 100 jobs for meaningful comparison
state_df_filtered = state_df_clean[state_df_clean['Total'] >= 100]
# Sort by remote percentage
top_remote_states = state_df_filtered.nlargest(10, 'Remote_Pct')
fig = px.bar(
    top_remote_states,
    x='STATE_NAME',
    y='Remote_Pct',
    color='Remote_Pct',
    color_continuous_scale='Blues',
    title='Top 10 States with Highest Remote Work Opportunities',
    labels={'STATE_NAME': 'State', 'Remote_Pct': 'Remote Work %'},
    text='Remote_Pct'
)
fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    xaxis_tickangle=-45,
    showlegend=False
)
fig.write_image("figures/top10_remote_states.jpg")
print("Top 10 remote states bar chart saved!")
fig
Top 10 remote states bar chart saved!